{
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# Importações",
   "id": "9cc7a4c844a5230e"
  },
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2025-05-04T21:38:00.439628Z",
     "start_time": "2025-05-04T21:38:00.415950Z"
    }
   },
   "source": [
    "import pandas as pd\n",
    "import datetime\n",
    "import locale\n",
    "import glob\n",
    "import os\n",
    "\n",
    "locale.setlocale(locale.LC_ALL, 'pt_BR.UTF-8')\n"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'pt_BR.UTF-8'"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 1
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# Carregar datasets básicos",
   "id": "72f220b34ac4c7bb"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-04T21:38:00.867689Z",
     "start_time": "2025-05-04T21:38:00.453910Z"
    }
   },
   "cell_type": "code",
   "source": [
    "consumo = pd.read_csv('./dados/consumo.csv', sep=\";\", header=0)\n",
    "consumo = consumo.query(\"CAMPUS not in ['LONDRINA - NORTE', 'ARAPONGAS', 'COLOMBO']\")\n",
    "\n",
    "consumo[\"DATA INICIO\"] = pd.to_datetime(consumo['DATA INICIO'], format='%d/%m/%Y')\n",
    "consumo[\"DATA FIM\"] = pd.to_datetime(consumo['DATA FIM'], format='%d/%m/%Y')\n",
    "\n",
    "feriados = pd.read_csv('dados/calendarios.csv', sep=\";\", header=0)\n",
    "feriados = feriados.query(\"CAMPUS not in ['LONDRINA - NORTE', 'ARAPONGAS', 'COLOMBO']\")\n",
    "\n",
    "cursos_sup = pd.read_csv('./dados/cursos_superiores.csv', sep=\";\", header=0)\n",
    "cursos_sup = cursos_sup.query(\"CAMPUS not in ['LONDRINA - NORTE', 'ARAPONGAS', 'COLOMBO']\")\n",
    "\n",
    "cursos_tec = pd.read_csv('./dados/cursos_tecnicos.csv', sep=\";\", header=0)\n",
    "cursos_tec = cursos_tec.query(\"CAMPUS not in ['LONDRINA - NORTE', 'ARAPONGAS', 'COLOMBO']\")\n"
   ],
   "id": "eef2d87d2ab50d81",
   "outputs": [],
   "execution_count": 2
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# Agrupar Datasets\n",
   "id": "e8c1b6d47fd60663"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-04T21:38:01.111542Z",
     "start_time": "2025-05-04T21:38:01.098295Z"
    }
   },
   "cell_type": "code",
   "source": [
    "\n",
    "# Agrupando os dados por campus \n",
    "grupos_consumo = consumo.groupby('CAMPUS')\n",
    "grupos_feriados = feriados.groupby('CAMPUS')\n",
    "grupos_cursos_sup = cursos_sup.query('Modalidade == \"Educação Presencial\"').groupby('CAMPUS')\n",
    "grupos_cursos_tec = cursos_tec.query('NO_MODALIDADE == \"EDUCAÇÃO PRESENCIAL\"').groupby('CAMPUS')\n",
    "\n",
    "# Criando subdatasets para cada campus \n",
    "subdf_consumo = {campus: dados for campus, dados in grupos_consumo}\n",
    "subdf_feriados = {campus: dados for campus, dados in grupos_feriados}\n",
    "subdf_cursos_sup = {campus: dados for campus, dados in grupos_cursos_sup}\n",
    "subdf_cursos_tec = {campus: dados for campus, dados in grupos_cursos_tec}\n",
    "\n"
   ],
   "id": "17009c1c04179966",
   "outputs": [],
   "execution_count": 3
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# Transformar Dados de Consumo Mensais em Diários",
   "id": "89f0e6dc45361d31"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-04T21:38:02.884135Z",
     "start_time": "2025-05-04T21:38:01.154557Z"
    }
   },
   "cell_type": "code",
   "source": [
    "dataframes = []\n",
    "dataframes_ignorados = []\n",
    "for campus, df_consumo in subdf_consumo.items():\n",
    "    dt_min = subdf_consumo[campus][\"DATA INICIO\"].min()\n",
    "    dt_max = subdf_consumo[campus][\"DATA FIM\"].max()\n",
    "\n",
    "    if dt_min.day != 1:\n",
    "        next_month = (dt_min.replace(day=28) + pd.DateOffset(days=4)).replace(day=1)\n",
    "        dt_min = next_month\n",
    "\n",
    "    if dt_max != (dt_max + pd.offsets.MonthEnd(0)):\n",
    "        dt_max = dt_max + pd.offsets.MonthEnd(-1)\n",
    "\n",
    "    datas = pd.date_range(start=dt_min, end=dt_max)\n",
    "    df = pd.DataFrame({'DATA': datas})\n",
    "    df[\"CAMPUS\"] = campus\n",
    "\n",
    "    for index, row in df_consumo.iterrows():\n",
    "        dt_inicio = row['DATA INICIO']\n",
    "        dt_fim = row['DATA FIM']\n",
    "        nr_dias = (dt_fim - dt_inicio).days + 1\n",
    "\n",
    "        mascara = (df['DATA'] >= pd.to_datetime(dt_inicio)) & (df['DATA'] <= pd.to_datetime(dt_fim))\n",
    "        df.loc[mascara, \"CONSUMO\"] = row[\"CONSUMO\"] / nr_dias\n",
    "\n",
    "    df[\"CONSUMO\"] = df[\"CONSUMO\"].round()\n",
    "    if \"PARANAGUÁ\" in campus:\n",
    "        dataframes_ignorados.append(df)\n",
    "    else:\n",
    "        dataframes.append(df)\n",
    "\n",
    "df_merged = pd.merge(dataframes_ignorados[0], dataframes_ignorados[1], on='DATA', how='inner',\n",
    "                     suffixes=('_df1', '_df2'))\n",
    "df_merged[\"CONSUMO\"] = df_merged[\"CONSUMO_df1\"].fillna(0) + df_merged[\"CONSUMO_df2\"].fillna(0)\n",
    "df_merged[\"CAMPUS\"] = \"PARANAGUÁ\"\n",
    "df_merged = df_merged.drop([\"CONSUMO_df1\", \"CONSUMO_df2\", \"CAMPUS_df1\", \"CAMPUS_df2\"], axis=1)\n",
    "dataframes.append(df_merged)\n",
    "\n",
    "df_consumo_diarios = pd.concat(dataframes, ignore_index=True)\n",
    "\n",
    "df_consumo_diarios.to_csv(\"./dados/dados_consumo_diario.csv\", sep=\";\", decimal=\".\", index=False)\n",
    "\n",
    "grupos_consumo_diarios = df_consumo_diarios.groupby('CAMPUS')\n",
    "subdf_consumo_diarios = {campus: dados for campus, dados in grupos_consumo_diarios}\n"
   ],
   "id": "6f8cf2d9ae48012e",
   "outputs": [],
   "execution_count": 4
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# Ajustar Dados Climáticos",
   "id": "54fb0dbd119c50b5"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-04T21:38:03.693995Z",
     "start_time": "2025-05-04T21:38:02.948373Z"
    }
   },
   "cell_type": "code",
   "source": [
    "df_clima = pd.read_csv(\"./dados/clima.csv\", sep=\";\", header=0, decimal=\".\")\n",
    "df_clima_londrina = df_clima.query('CAMPUS == \"LONDRINA\"')\n",
    "\n",
    "df_clima_londrina.loc[df_clima_londrina['CAMPUS'] == \"LONDRINA\", 'CAMPUS'] = \"LONDRINA - NORTE\"\n",
    "df_clima.loc[df_clima['CAMPUS'] == \"LONDRINA\", 'CAMPUS'] = \"LONDRINA - CENTRO\"\n",
    "\n",
    "df_clima = pd.concat([df_clima, df_clima_londrina], ignore_index=True)\n",
    "\n",
    "df_clima[\"DATA\"] = pd.to_datetime(df_clima[\"DATA\"], format=\"%d/%m/%Y\")\n",
    "\n",
    "df_clima.to_csv(\"./dados/dados_climaticos.csv\", sep=\";\", decimal=\".\", index=False)"
   ],
   "id": "d9601a54bc6682ac",
   "outputs": [],
   "execution_count": 5
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# Criar Datasets conforme Calendários Acadêmicos",
   "id": "82c927b3472b68c1"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-04T21:39:01.172727Z",
     "start_time": "2025-05-04T21:38:03.769412Z"
    }
   },
   "cell_type": "code",
   "source": [
    "\n",
    "for campus, df_feriados in subdf_consumo_diarios.items():\n",
    "    dt_min = pd.to_datetime(subdf_consumo_diarios[campus][\"DATA\"]).min()\n",
    "    dt_max = pd.to_datetime(subdf_consumo_diarios[campus][\"DATA\"]).max()\n",
    "\n",
    "    dt_inicio = datetime.date(year=2010, month=1, day=1)\n",
    "    dt_fim = datetime.date.today()\n",
    "\n",
    "    datas = pd.date_range(start=dt_inicio, end=dt_fim)\n",
    "    df = pd.DataFrame({'DATA': datas})\n",
    "    df[\"CAMPUS\"] = campus\n",
    "\n",
    "    for index, row in df.iterrows():\n",
    "        data = row['DATA'].date()\n",
    "        dia = data.day\n",
    "        mes = data.month\n",
    "        ano = data.year\n",
    "\n",
    "        rows_data_fixa = feriados.loc[feriados[\"DATA INICIO\"] == f\"{dia}/{mes}\"]\n",
    "        rows_data_flexivel = feriados.loc[feriados[\"DATA INICIO\"] == f\"{dia:02}/{mes:02}/{ano:04}\"]\n",
    "\n",
    "        if not rows_data_fixa.empty:\n",
    "            for _, row_data_fixa in rows_data_fixa.iterrows():\n",
    "                if campus in row_data_fixa[\"CAMPUS\"] or row_data_fixa[\"CAMPUS\"] == \"*\":\n",
    "                    [dia_final_fixo, mes_final_fixo] = str(row_data_fixa[\"DATA FIM\"]).split(\"/\")\n",
    "                    data_final_fixa = datetime.date(ano, int(mes_final_fixo), int(dia_final_fixo))\n",
    "                    mascara = (df['DATA'] >= pd.to_datetime(data)) & (df['DATA'] <= pd.to_datetime(data_final_fixa))\n",
    "                    df.loc[mascara, row_data_fixa[\"MOTIVO\"]] = 1\n",
    "\n",
    "        if not rows_data_flexivel.empty:\n",
    "            for _, row_data_flexivel in rows_data_flexivel.iterrows():\n",
    "                if campus in row_data_flexivel[\"CAMPUS\"] or row_data_flexivel[\"CAMPUS\"] == \"*\":\n",
    "                    [dia_final_flexivel, mes_final_flexivel, ano_final_flexivel] = str(\n",
    "                        row_data_flexivel[\"DATA FIM\"]).split(\"/\")\n",
    "                    data_final_flexivel = datetime.date(int(ano_final_flexivel), int(mes_final_flexivel),\n",
    "                                                        int(dia_final_flexivel))\n",
    "                    mascara = (df['DATA'] >= pd.to_datetime(data)) & (df['DATA'] <= pd.to_datetime(data_final_flexivel))\n",
    "                    df.loc[mascara, row_data_flexivel[\"MOTIVO\"]] = 1\n",
    "\n",
    "    df = df.fillna(0)\n",
    "    df = df.loc[(df['DATA'] >= dt_min) & (df['DATA'] <= dt_max)]\n",
    "    df.to_csv(f\"./dados/calendarios/{campus}.csv\", index=False, sep=\";\", decimal=\".\")\n"
   ],
   "id": "1852980aa8390ff0",
   "outputs": [],
   "execution_count": 6
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# Criar Dataset de Dados Temporais",
   "id": "390c839a01e859c7"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-04T21:39:04.431194Z",
     "start_time": "2025-05-04T21:39:01.231335Z"
    }
   },
   "cell_type": "code",
   "source": [
    "pasta = './dados/calendarios/'\n",
    "\n",
    "# Obter uma lista de todos os arquivos .csv na pasta\n",
    "arquivos_csv = glob.glob(os.path.join(pasta, \"*.csv\"))\n",
    "# Criar uma lista para armazenar os DataFrames\n",
    "dataframes = []\n",
    "\n",
    "# Iterar sobre a lista de arquivos .csv e ler cada um em um DataFrame\n",
    "for arquivo in arquivos_csv:\n",
    "    df = pd.read_csv(arquivo, sep=\";\", header=0)\n",
    "    df['CAMPUS'] = os.path.basename(arquivo.replace(\".csv\", \"\"))\n",
    "    df['CAMPUS_'] = df[\"CAMPUS\"]\n",
    "    df['DATA'] = pd.to_datetime(df['DATA'])\n",
    "    df['DIA_DA_SEMANA'] = df['DATA'].dt.strftime('%a')\n",
    "    df['MÊS'] = df['DATA'].dt.strftime('%b')\n",
    "    df['ANO'] = df['DATA'].dt.strftime('%Y')\n",
    "    df = pd.get_dummies(df, columns=['DIA_DA_SEMANA', \"MÊS\", \"ANO\", \"CAMPUS\"])\n",
    "    df = df.rename(columns={'CAMPUS_': 'CAMPUS'})\n",
    "\n",
    "    df = df.fillna(False)\n",
    "    df[df.drop([\"CAMPUS\", \"DATA\"], axis=1).columns] = df[df.drop([\"CAMPUS\", \"DATA\"], axis=1).columns].astype(int)\n",
    "    dataframes.append(df)\n",
    "\n",
    "# Concatenar todos os DataFrames em um único DataFrame\n",
    "df_completo = pd.concat(dataframes, ignore_index=True)\n",
    "df_completo = df_completo.fillna(0)\n",
    "\n",
    "df_completo.to_csv(\"./dados/dados_temporais.csv\", sep=\";\", decimal=\".\", index=False)\n"
   ],
   "id": "38f32a01bfda035",
   "outputs": [],
   "execution_count": 7
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# Criar Datasets conforme Cursos Superiores (E-MEC)",
   "id": "9f059a4200f52692"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-04T21:39:05.045246Z",
     "start_time": "2025-05-04T21:39:04.493107Z"
    }
   },
   "cell_type": "code",
   "source": [
    "dataframes = []\n",
    "for campus, df_cursos_sup in subdf_cursos_sup.items():\n",
    "    dt_min = pd.to_datetime(subdf_consumo_diarios[campus][\"DATA\"]).min()\n",
    "    dt_max = pd.to_datetime(subdf_consumo_diarios[campus][\"DATA\"]).max()\n",
    "\n",
    "    datas = pd.date_range(start=pd.to_datetime(df_cursos_sup[\"Início Funcionamento\"], format='%d/%m/%Y').min(),\n",
    "                          end=datetime.date.today())\n",
    "    df = pd.DataFrame({'DATA': datas})\n",
    "    df[\"CAMPUS\"] = campus\n",
    "    df[\"CURSOS_GRAD_MATUTINO\"] = 0\n",
    "    df[\"CURSOS_GRAD_VESPERTINO\"] = 0\n",
    "    df[\"CURSOS_GRAD_NOTURNO\"] = 0\n",
    "    df[\"CURSOS_POS\"] = 0\n",
    "\n",
    "    for index, row in df_cursos_sup.iterrows():\n",
    "        dt_inicio = pd.to_datetime(row[\"Início Funcionamento\"], format='%d/%m/%Y')\n",
    "        if row[\"Fim Funcionamento\"] is not None and not pd.isnull(row[\"Fim Funcionamento\"]):\n",
    "            dt_fim = pd.to_datetime(row[\"Fim Funcionamento\"], format='%d/%m/%Y')\n",
    "        else:\n",
    "            dt_fim = pd.to_datetime(datetime.date.today())\n",
    "\n",
    "        if row[\"Grau\"] in [\"Bacharelado\", \"Tecnológico\", \"Licenciatura\"]:\n",
    "            if row[\"Quantitativo de Vagas - Integral\"] > 0:\n",
    "                df.loc[(df['DATA'] >= pd.to_datetime(dt_inicio)) &\n",
    "                       (df['DATA'] <= pd.to_datetime(dt_fim)), \"CURSOS_GRAD_MATUTINO\"] += 1\n",
    "                df.loc[(df['DATA'] >= pd.to_datetime(dt_inicio)) &\n",
    "                       (df['DATA'] <= pd.to_datetime(dt_fim)), \"CURSOS_GRAD_VESPERTINO\"] += 1\n",
    "            else:\n",
    "                if row[\"Quantitativo de Vagas - Matutino\"] > 0:\n",
    "                    df.loc[(df['DATA'] >= pd.to_datetime(dt_inicio)) &\n",
    "                           (df['DATA'] <= pd.to_datetime(dt_fim)), \"CURSOS_GRAD_MATUTINO\"] += 1\n",
    "\n",
    "                if row[\"Quantitativo de Vagas - Vespertino\"] > 0:\n",
    "                    df.loc[(df['DATA'] >= pd.to_datetime(dt_inicio)) &\n",
    "                           (df['DATA'] <= pd.to_datetime(dt_fim)), \"CURSOS_GRAD_VESPERTINO\"] += 1\n",
    "\n",
    "            if row[\"Quantitativo de Vagas - Noturno\"] > 0:\n",
    "                df.loc[(df['DATA'] >= pd.to_datetime(dt_inicio)) &\n",
    "                       (df['DATA'] <= pd.to_datetime(dt_fim)), \"CURSOS_GRAD_NOTURNO\"] += 1\n",
    "\n",
    "        elif row[\"Grau\"] in [\"Especialização\", \"Mestrado\"]:\n",
    "            df.loc[(df['DATA'] >= pd.to_datetime(dt_inicio)) &\n",
    "                   (df['DATA'] <= pd.to_datetime(dt_fim)), \"CURSOS_POS\"] += 1\n",
    "\n",
    "    df = df.fillna(0)\n",
    "    df = df.loc[(df['DATA'] >= dt_min) & (df['DATA'] <= dt_max)]\n",
    "    df.to_csv(f\"./dados/cursos_superiores/{campus}.csv\", index=False, sep=\";\", decimal=\".\")\n",
    "    dataframes.append(df)\n",
    "\n",
    "# Concatenar todos os DataFrames em um único DataFrame\n",
    "df_completo = pd.concat(dataframes, ignore_index=True)\n",
    "df_completo = df_completo.fillna(0)\n",
    "\n",
    "df_completo.to_csv(\"./dados/dados_cursos_superiores.csv\", sep=\";\", decimal=\".\", index=False)\n"
   ],
   "id": "fd5b0245a9347095",
   "outputs": [],
   "execution_count": 8
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# Criar Datasets conforme Cursos Técnicos (SISTEC)",
   "id": "b185eaa3bced20dd"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-04T21:39:05.859725Z",
     "start_time": "2025-05-04T21:39:05.104092Z"
    }
   },
   "cell_type": "code",
   "source": [
    "dataframes = []\n",
    "for campus, df_cursos_tec in subdf_cursos_tec.items():\n",
    "    dt_min = pd.to_datetime(subdf_consumo_diarios[campus][\"DATA\"]).min()\n",
    "    dt_max = pd.to_datetime(subdf_consumo_diarios[campus][\"DATA\"]).max()\n",
    "\n",
    "    datas = pd.date_range(start=pd.to_datetime(df_cursos_tec[\"DATA INICIO\"], format='%d/%m/%Y').min(),\n",
    "                          end=datetime.date.today())\n",
    "    df = pd.DataFrame({'DATA': datas})\n",
    "    df[\"CAMPUS\"] = campus\n",
    "    df[\"CURSOS_TEC_CONCOMITANTE\"] = 0\n",
    "    df[\"CURSOS_TEC_INTEGRADO\"] = 0\n",
    "    df[\"CURSOS_TEC_SUBSEQUENTE\"] = 0\n",
    "\n",
    "    for index, row in df_cursos_tec.iterrows():\n",
    "        dt_inicio = pd.to_datetime(row[\"DATA INICIO\"], format='%d/%m/%Y')\n",
    "        dt_fim = pd.to_datetime(row[\"DATA FIM\"], format='%d/%m/%Y')\n",
    "\n",
    "        if row[\"NO_TIPO_OFERTA\"] == \"CONCOMITANTE\":\n",
    "            df.loc[(df['DATA'] >= pd.to_datetime(dt_inicio)) & (\n",
    "                    df['DATA'] <= pd.to_datetime(dt_fim)), \"CURSOS_TEC_CONCOMITANTE\"] += 1\n",
    "        elif row[\"NO_TIPO_OFERTA\"] == \"INTEGRADO\":\n",
    "            df.loc[(df['DATA'] >= pd.to_datetime(dt_inicio)) & (\n",
    "                    df['DATA'] <= pd.to_datetime(dt_fim)), \"CURSOS_TEC_INTEGRADO\"] += 1\n",
    "        elif row[\"NO_TIPO_OFERTA\"] == \"SUBSEQUENTE\":\n",
    "            df.loc[(df['DATA'] >= pd.to_datetime(dt_inicio)) & (\n",
    "                    df['DATA'] <= pd.to_datetime(dt_fim)), \"CURSOS_TEC_SUBSEQUENTE\"] += 1\n",
    "\n",
    "    df = df.fillna(0)\n",
    "    df = df.loc[(df['DATA'] >= dt_min) & (df['DATA'] <= dt_max)]\n",
    "    df.to_csv(f\"./dados/cursos_tecnicos/{campus}.csv\", index=False, sep=\";\", decimal=\".\")\n",
    "    dataframes.append(df)\n",
    "\n",
    "# Concatenar todos os DataFrames em um único DataFrame\n",
    "df_completo = pd.concat(dataframes, ignore_index=True)\n",
    "df_completo = df_completo.fillna(0)\n",
    "\n",
    "df_completo.to_csv(\"./dados/dados_cursos_tecnicos.csv\", sep=\";\", decimal=\".\", index=False)\n"
   ],
   "id": "50656627d8195a81",
   "outputs": [],
   "execution_count": 9
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# Mesclar Datasets",
   "id": "3c748f7da1d4b84"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-04T21:39:44.806234Z",
     "start_time": "2025-05-04T21:39:05.918840Z"
    }
   },
   "cell_type": "code",
   "source": [
    "dados_consumo = pd.read_csv(\"./dados/dados_consumo_diario.csv\", header=0, sep=\";\", decimal=\".\")\n",
    "dados_climaticos = pd.read_csv(\"./dados/dados_climaticos.csv\", header=0, sep=\";\", decimal=\".\")\n",
    "dados_temporais = pd.read_csv(\"./dados/dados_temporais.csv\", header=0, sep=\";\", decimal=\".\")\n",
    "dados_cursos_tecnicos = pd.read_csv(\"./dados/dados_cursos_tecnicos.csv\", header=0, sep=\";\", decimal=\".\")\n",
    "dados_cursos_superiores = pd.read_csv(\"./dados/dados_cursos_superiores.csv\", header=0, sep=\";\", decimal=\".\")\n",
    "\n",
    "dados_consumo[\"DATA\"] = pd.to_datetime(dados_consumo[\"DATA\"])\n",
    "dados_climaticos[\"DATA\"] = pd.to_datetime(dados_climaticos[\"DATA\"])\n",
    "dados_temporais[\"DATA\"] = pd.to_datetime(dados_temporais[\"DATA\"])\n",
    "dados_cursos_tecnicos[\"DATA\"] = pd.to_datetime(dados_cursos_tecnicos[\"DATA\"])\n",
    "dados_cursos_superiores[\"DATA\"] = pd.to_datetime(dados_cursos_superiores[\"DATA\"])\n",
    "\n",
    "df_merged = (dados_consumo\n",
    "             .merge(dados_climaticos, on=['DATA', \"CAMPUS\"], how='left')\n",
    "             .merge(dados_temporais, on=['DATA', \"CAMPUS\"], how='left')\n",
    "             .merge(dados_cursos_tecnicos, on=['DATA', \"CAMPUS\"], how='left')\n",
    "             .merge(dados_cursos_superiores, on=['DATA', \"CAMPUS\"], how='left'))\n",
    "\n",
    "df_merged[dados_cursos_superiores.columns] = df_merged[dados_cursos_superiores.columns].fillna(0)\n",
    "df_merged[dados_cursos_tecnicos.columns] = df_merged[dados_cursos_tecnicos.columns].fillna(0)\n",
    "\n",
    "for index, row in df_merged[df_merged.isnull().any(axis=1)].iterrows():\n",
    "    mes = pd.to_datetime(row[\"DATA\"]).month\n",
    "    campus = row[\"CAMPUS\"]\n",
    "    for col in df_merged.columns:\n",
    "        if pd.isnull(row[col]) and col in dados_climaticos.columns:\n",
    "            media_mes = df_merged[(df_merged[\"DATA\"].dt.month == mes) &\n",
    "                                  (df_merged[\"CAMPUS\"] == campus)][col].mean()\n",
    "            df_merged.at[index, col] = media_mes\n",
    "\n",
    "for col in dados_climaticos.drop(['DATA', 'CAMPUS'], axis=1).columns:\n",
    "    for suffix in ['MIN', 'MÉD', 'MAX', 'ACC']:\n",
    "        df_merged[f'{col}_{suffix}_MENS'] = df_merged[col]\n",
    "    df_merged.drop(col, axis=1, inplace=True)\n",
    "\n",
    "df_merged = df_merged.groupby('CAMPUS')\n",
    "\n",
    "regioes = {\n",
    "    'CAMPO LARGO': \"REGIÃO LESTE\",\n",
    "    # 'COLOMBO': \"REGIÃO LESTE\",\n",
    "    'CURITIBA': \"REGIÃO LESTE\",\n",
    "    'PARANAGUÁ': \"REGIÃO LESTE\",\n",
    "    'PINHAIS': \"REGIÃO LESTE\",\n",
    "    'CAPANEMA': \"REGIÃO SUL\",\n",
    "    'CORONEL VIVIDA': \"REGIÃO SUL\",\n",
    "    'PALMAS': \"REGIÃO SUL\",\n",
    "    'BARRACÃO': \"REGIÃO SUL\",\n",
    "    'UNIÃO DA VITÓRIA': \"REGIÃO SUL\",\n",
    "    'ASSIS CHATEAUBRIAND': \"REGIÃO OESTE\",\n",
    "    'CASCAVEL': \"REGIÃO OESTE\",\n",
    "    'FOZ DO IGUAÇU': \"REGIÃO OESTE\",\n",
    "    'GOIOERÊ': 'REGIÃO OESTE',\n",
    "    'QUEDAS DO IGUAÇU': \"REGIÃO OESTE\",\n",
    "    'UMUARAMA': 'REGIÃO OESTE',\n",
    "    # 'ARAPONGAS': \"REGIÃO NORTE\",\n",
    "    'ASTORGA': \"REGIÃO NORTE\",\n",
    "    'JACAREZINHO': \"REGIÃO NORTE\",\n",
    "    'LONDRINA - CENTRO': \"REGIÃO NORTE\",\n",
    "    # 'LONDRINA - NORTE': \"REGIÃO NORTE\",\n",
    "    'PARANAVAÍ': 'REGIÃO NORTE',\n",
    "    'IRATI': \"REGIÃO CENTRAL\",\n",
    "    'IVAIPORÃ': \"REGIÃO CENTRAL\",\n",
    "    'JAGUARIAÍVA': \"REGIÃO CENTRAL\",\n",
    "    'PITANGA': \"REGIÃO CENTRAL\",\n",
    "    'TELÊMACO BORBA': \"REGIÃO CENTRAL\",\n",
    "}\n",
    "\n",
    "dataframes = []\n",
    "for campus, df in df_merged:\n",
    "    df[\"PERIODO\"] = df[\"DATA\"]\n",
    "    df = df.groupby(pd.Grouper(key=\"PERIODO\", freq=\"ME\")).agg({\n",
    "        'CONSUMO': 'sum',\n",
    "        'DATA': \"last\",\n",
    "        **{col: 'mean' for col in df.filter(like='_MÉD_').columns},\n",
    "        **{col: 'min' for col in df.filter(like='_MIN_').columns},\n",
    "        **{col: 'max' for col in df.filter(like='_MAX_').columns},\n",
    "        **{col: 'sum' for col in df.filter(like='_ACC_').columns},\n",
    "        **{col: 'sum' for col in df.filter(like='DIA_').columns},\n",
    "        **{col: 'last' for col in df.filter(like='MÊS').columns},\n",
    "        **{col: 'last' for col in df.filter(like='ANO').columns},\n",
    "        **{col: 'last' for col in df.filter(like='CAMPUS_').columns},\n",
    "        **{col: 'mean' for col in df.filter(like='CURSOS_').columns},\n",
    "        **{col: 'sum' for col in ['FÉRIAS', 'FERIADO', 'COVID', 'GREVE']},\n",
    "    })\n",
    "    df[\"CAMPUS\"] = campus\n",
    "    df[\"REGIÃO\"] = regioes.get(campus)\n",
    "\n",
    "    df[df.filter(like='_MENS').columns] = round(df[df.filter(like='_MENS').columns])\n",
    "    df[\"CONSUMO\"] = round(df[\"CONSUMO\"])\n",
    "    dataframes.append(df)\n",
    "    df[\"ORDEM\"] = range(1, len(df) + 1)\n",
    "    df[['CONSUMO', \"CAMPUS\", 'REGIÃO', \"ORDEM\"]].to_csv(f\"./dados/series/{campus}.csv\", sep=\";\", decimal=\".\")\n",
    "\n",
    "# Concatenar todos os DataFrames em um único DataFrame\n",
    "df_completo = pd.concat(dataframes, ignore_index=True)\n",
    "df_completo.to_csv(\"./dados/dados_mesclados.csv\", sep=\";\", decimal=\".\", index=False)\n"
   ],
   "id": "f890b71544644357",
   "outputs": [],
   "execution_count": 10
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
